#project website. can you see it? https://josiealford14.github.io/therealcostofdrugs/
library(tidyverse)
library(dplyr)
library(rvest)
library(httr)
library(janitor)
library(stringr)
library(forcats)
library(tidytext)
library(viridis)
library(broom)
library(plotly)

Dataset 1: Drug Prices

# # API code
# url = "https://data.medicaid.gov/resource/tau9-gfwr.csv"
# drug_price =
#   GET(url, query = list("$limit" = 9999999)) %>%
#   content("parsed") %>%
#   clean_names()

# # CSV code
drug_price =
  read_csv("../data/drug_price.csv") %>%
  clean_names()
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   X1 = col_integer(),
##   as_of_date = col_date(format = ""),
##   classification_for_rate_setting = col_character(),
##   corresponding_generic_drug_effective_date = col_character(),
##   corresponding_generic_drug_nadac_per_unit = col_character(),
##   effective_date = col_date(format = ""),
##   explanation_code = col_character(),
##   nadac_per_unit = col_double(),
##   ndc = col_character(),
##   ndc_description = col_character(),
##   otc = col_character(),
##   pharmacy_type_indicator = col_character(),
##   pricing_unit = col_character()
## )
drug_price = 
  drug_price %>%
  rename(trade_name = ndc_description,
         drug_type = classification_for_rate_setting,
         unit_price = nadac_per_unit) %>%
  mutate(trade_name = str_replace(trade_name, "[0-9]", "_"),
         trade_name = str_to_title(trade_name),
         otc = recode(otc, "Y" = "yes", "N" = "no"),
         otc = str_to_title(otc),
         otc = as.factor(otc)) %>% 
  separate(trade_name, into = c("trade_name", "remove"), sep = "_") %>% 
  separate(effective_date, into = c("year", "month", "date")) %>%
  select(-c(date, x1, remove)) %>%
  mutate(trade_name = trimws(trade_name, which = c("both"))) %>% 
  separate(trade_name, into = c("trade_name", "remove"), sep = " ") %>% 
  filter(is.na(remove)) %>%
  select(-remove) %>% 
  filter(pricing_unit == "GM" | pricing_unit == "EA") %>% 
  select(trade_name, everything()) 
## Warning: Too few values at 308308 locations: 1, 2, 7, 17, 18, 19, 30, 45,
## 48, 56, 59, 60, 64, 66, 68, 85, 86, 87, 92, 107, ...
## Warning in format.POSIXlt(as.POSIXlt(x), ...): unknown timezone 'zone/tz/
## 2017c.1.0/zoneinfo/America/New_York'
## Warning: Too many values at 592722 locations: 5, 17, 18, 48, 68, 112, 128,
## 136, 138, 173, 196, 215, 222, 231, 235, 241, 265, 273, 279, 294, ...
## Warning: Too few values at 2730099 locations: 1, 3, 4, 6, 8, 9, 10, 11, 12,
## 13, 15, 16, 20, 21, 23, 24, 25, 26, 27, 28, ...
drug_price %>%
  filter(trade_name == "Abacavir") %>%
  ggplot(aes(x = year, y = unit_price)) + geom_violin(aes(fill = year), color = "blue", alpha = .5)

new_drug_price = drug_price %>% 
  group_by(trade_name, drug_type, otc, pricing_unit, year) %>% 
  summarize(avg_price = mean(unit_price, na.rm = TRUE))

new_drug_price
## # A tibble: 6,023 x 6
## # Groups:   trade_name, drug_type, otc, pricing_unit [?]
##    trade_name drug_type    otc pricing_unit  year avg_price
##         <chr>     <chr> <fctr>        <chr> <chr>     <dbl>
##  1                    G    Yes           EA  2013 0.2434375
##  2                    G    Yes           EA  2014 0.2430778
##  3                    G    Yes           EA  2015 0.2556137
##  4                    G    Yes           EA  2016 0.2538263
##  5                    G    Yes           EA  2017 0.2339103
##  6                    G    Yes           GM  2016 0.3077704
##  7                    G    Yes           GM  2017 0.3087167
##  8   Abacavir         G     No           EA  2013 5.3170850
##  9   Abacavir         G     No           EA  2014 4.7696039
## 10   Abacavir         G     No           EA  2015 3.6822746
## # ... with 6,013 more rows

Dataset 2: Funding for Research

url = "https://report.nih.gov/categorical_spending.aspx"
NIH_xml = read_html(url)
NIH_funding = (NIH_xml %>% html_nodes(css = "table"))[[3]] %>% html_table() %>% as_tibble()
NIH_funding = NIH_funding %>%
  clean_names() %>%
  mutate(US_mortality_2015 = x2015_us_mortality_19)

NIH_funding$x2015_us_prevalence_standard_error_19  = recode(NIH_funding$x2015_us_prevalence_standard_error_19 , "-" = "NA")
NIH_funding$US_mortality_2015 = recode(NIH_funding$US_mortality_2015, "-" = "NA")

NIH_funding = NIH_funding %>%
  separate(x2015_us_prevalence_standard_error_19, into = c("prevalence", "remove"), sep = "\\%") %>%
  select(-x2015_us_mortality_19, -remove)
## Warning: Too many values at 36 locations: 7, 19, 21, 24, 25, 35, 37, 40,
## 46, 48, 54, 59, 74, 80, 81, 89, 93, 114, 115, 117, ...
## Warning: Too few values at 246 locations: 1, 2, 3, 4, 5, 6, 8, 9, 10, 11,
## 12, 13, 14, 15, 16, 17, 18, 20, 22, 23, ...
NIH_funding$fy_2013actual = recode(NIH_funding$fy_2013actual, "+" = "NNA")
NIH_funding$fy_2014actual = recode(NIH_funding$fy_2014actual, "+" = "NNA")
NIH_funding$fy_2015actual = recode(NIH_funding$fy_2015actual, "+" = "NNA")
NIH_funding$fy_2016actual = recode(NIH_funding$fy_2016actual, "+" = "NNA")

NIH_funding = NIH_funding %>%
  mutate(fy_2013 = as.numeric(substring(fy_2013actual, 2))) %>%
  mutate(fy_2014 = as.numeric(substring(fy_2014actual, 2))) %>%
  mutate(fy_2015 = as.numeric(substring(fy_2015actual, 2))) %>%
  mutate(fy_2016 = as.numeric(substring(fy_2016actual, 2))) %>%
  mutate(fy_2017 = as.numeric(substring(fy_2017estimated_enacted, 2))) %>%
  mutate(fy_2018estimated = as.numeric(substring(fy_2018estimated, 2))) 
## Warning in evalq(as.numeric(substring(fy_2013actual, 2)), <environment>):
## NAs introduced by coercion
## Warning in evalq(as.numeric(substring(fy_2014actual, 2)), <environment>):
## NAs introduced by coercion
## Warning in evalq(as.numeric(substring(fy_2015actual, 2)), <environment>):
## NAs introduced by coercion
## Warning in evalq(as.numeric(substring(fy_2016actual, 2)), <environment>):
## NAs introduced by coercion
## Warning in evalq(as.numeric(substring(fy_2017estimated_enacted, 2)),
## <environment>): NAs introduced by coercion
## Warning in evalq(as.numeric(substring(fy_2018estimated, 2)),
## <environment>): NAs introduced by coercion
NIH_funding$prevalence = as.numeric(NIH_funding$prevalence)
## Warning: NAs introduced by coercion
NIH_funding$US_mortality_2015 = as.numeric(NIH_funding$US_mortality_2015)
## Warning: NAs introduced by coercion
NIH_funding = NIH_funding %>%
  select(-c(fy_2013actual, fy_2014actual, fy_2015actual, fy_2016actual, fy_2017estimated_enacted))

final_NIH_funding = NIH_funding %>%
  rename(disease = research_disease_areas_dollars_in_millions_and_rounded, 
         '2013' = fy_2013,
         '2014' = fy_2014,
         '2015' = fy_2015,
         '2016' = fy_2016,
         '2017' = fy_2017,
         '2018' = fy_2018estimated) %>%
  gather(key = "year", value = funding, c("2013", "2014", "2015", "2016", "2017", "2018")) %>% 
  separate(disease, into = c("disease", "remove"), sep = " ") %>% 
  select(-remove)
## Warning: Too many values at 666 locations: 1, 2, 3, 4, 6, 7, 10, 11, 12,
## 24, 28, 30, 38, 50, 52, 53, 54, 55, 56, 58, ...
## Warning: Too few values at 384 locations: 5, 8, 13, 14, 17, 18, 19, 21, 23,
## 25, 32, 33, 39, 40, 44, 47, 66, 71, 73, 84, ...
rm(NIH_funding)

final_NIH_funding
## # A tibble: 1,692 x 5
##        disease prevalence US_mortality_2015  year funding
##  *       <chr>      <dbl>             <dbl> <chr>   <dbl>
##  1    Acquired         NA                NA  2013      NA
##  2       Acute         NA                NA  2013      95
##  3  Adolescent         NA               138  2013      70
##  4       Agent         NA                NA  2013      10
##  5       Aging         NA                NA  2013      NA
##  6 Alcoholism,         NA                NA  2013     437
##  7    Allergic        8.2                77  2013       9
##  8         ALS         NA                NA  2013      39
##  9 Alzheimer's         NA                NA  2013     504
## 10 Alzheimer's         NA                NA  2013      NA
## # ... with 1,682 more rows

Dataset 3: Patents

patents = read_csv("./data/patent.csv")
## Parsed with column specification:
## cols(
##   appl_type = col_character(),
##   appl_no = col_integer(),
##   product_no = col_integer(),
##   patent_no = col_character(),
##   patexpstr = col_character(),
##   patexp = col_character(),
##   drugsub = col_character(),
##   drugprod = col_character(),
##   patuse = col_character(),
##   delist = col_character()
## )
patents = patents %>% 
  clean_names() %>%
  rename(drug_type = appl_type) %>% 
  mutate(drug_type = recode(drug_type, "N" = "B")) %>%
  separate(patexp, into = c("month_pat_expires", "day_pat_expires", "year_pat_expires"), sep = "/") %>%
  separate(patent_no, into = c("patent_num", "pediatric_exclusivity_granted"), sep = "\\*")
## Warning: Too few values at 12167 locations: 1, 2, 3, 5, 7, 9, 10, 11, 12,
## 13, 16, 17, 18, 19, 20, 21, 22, 23, 24, 26, ...
patents$pediatric_exclusivity_granted[is.na(patents$pediatric_exclusivity_granted)] = "no"
patents$drugsub[is.na(patents$drugsub)] = "no"
patents$drugprod[is.na(patents$drugprod)] = "no"
patents$delist[is.na(patents$delist)] = "no"

patents = patents %>%
  mutate(pediatric_exclusivity_granted = factor(pediatric_exclusivity_granted, levels = c("no", "PED"), labels = c("No", "Yes"))) %>%
  mutate(drug_substance_flag = factor(drugsub, levels = c("no", "Y"), labels = c("no", "yes"))) %>%
  mutate(drug_prod_flag = factor(drugprod, levels = c("no", "Y"), labels = c("no", "yes"))) %>%
  mutate(patent_delisted = factor(delist, levels = c("no", "Y"), labels = c("no", "yes"))) %>%
  select(-drugsub, -drugprod, -patexpstr) %>%
  rename(pat_use_code = patuse)
patents
## # A tibble: 13,858 x 13
##    drug_type appl_no product_no patent_num pediatric_exclusivity_granted
##        <chr>   <int>      <int>      <chr>                        <fctr>
##  1         B   22425          1    5223510                            No
##  2         B   22425          1    5223510                            No
##  3         B   22307          1    5288726                            No
##  4         B   22307          1    5288726                           Yes
##  5         B   21462          1    5344932                            No
##  6         B   21462          1    5344932                           Yes
##  7         B   22088          1    5362718                            No
##  8         B   22088          1    5362718                           Yes
##  9         B   21060          1    5364842                            No
## 10         B   21060          1    5364842                            No
## # ... with 13,848 more rows, and 8 more variables:
## #   month_pat_expires <chr>, day_pat_expires <chr>,
## #   year_pat_expires <chr>, pat_use_code <chr>, delist <chr>,
## #   drug_substance_flag <fctr>, drug_prod_flag <fctr>,
## #   patent_delisted <fctr>

Dataset 4: Products

*The Products dataset from the FDA Orange Book

*Type: The group or category of approved drugs. RX (prescription), OTC (over the counter), or DISCN (discontinued).

*Applicant Full Name: The full name of the firm holding legal responsibility for the new drug application.

products = read_csv("./data/products.csv")
## Parsed with column specification:
## cols(
##   ingredient = col_character(),
##   dfroute = col_character(),
##   doseform = col_character(),
##   route = col_character(),
##   trade_name = col_character(),
##   applicant = col_character(),
##   strength = col_character(),
##   appl_type = col_character(),
##   appl_no = col_integer(),
##   product_no = col_integer(),
##   tecodestr = col_character(),
##   tecode = col_integer(),
##   appdatestr = col_character(),
##   appdate = col_character(),
##   rld = col_character(),
##   rs = col_character(),
##   type = col_character(),
##   appfull = col_character()
## )
products = products %>%
  rename(drug_type = appl_type,
         otc = type) %>% 
  mutate(drug_type = recode(drug_type, "A" = "G", "N" = "B"),
         otc = recode(otc, "OTC" = "yes", "RX" = "no", "DISCN" = "no")) %>%
  separate(appdate, into = c("approval_month", "approval_day", "approval_year"), sep = "/") %>%
  #separate(doseform, into = c("form", "release_type"), sep = ",")
  mutate(otc = factor(otc)) %>%
  mutate(ref_listed_drug = factor(rld)) %>%
  mutate(ref_std = factor(rs)) %>%
  select(-appdatestr, -rld, -rs, -dfroute)
products
## # A tibble: 34,209 x 18
##                                           ingredient doseform route
##                                                <chr>    <chr> <chr>
##  1                                  ABACAVIR SULFATE SOLUTION  ORAL
##  2                                  ABACAVIR SULFATE SOLUTION  ORAL
##  3                                  ABACAVIR SULFATE   TABLET  ORAL
##  4                                  ABACAVIR SULFATE   TABLET  ORAL
##  5                                  ABACAVIR SULFATE   TABLET  ORAL
##  6                                  ABACAVIR SULFATE   TABLET  ORAL
##  7                                  ABACAVIR SULFATE   TABLET  ORAL
##  8                                  ABACAVIR SULFATE   TABLET  ORAL
##  9 ABACAVIR SULFATE; DOLUTEGRAVIR SODIUM; LAMIVUDINE   TABLET  ORAL
## 10                      ABACAVIR SULFATE; LAMIVUDINE   TABLET  ORAL
## # ... with 34,199 more rows, and 15 more variables: trade_name <chr>,
## #   applicant <chr>, strength <chr>, drug_type <chr>, appl_no <int>,
## #   product_no <int>, tecodestr <chr>, tecode <int>, approval_month <chr>,
## #   approval_day <chr>, approval_year <chr>, otc <fctr>, appfull <chr>,
## #   ref_listed_drug <fctr>, ref_std <fctr>

Dataset 5: Exclusivity

We load and tidy the exclusivity dataset

exclusivity = read_csv("./data/exclusivity.csv") %>%
  clean_names() %>%
  rename(drug_type = appl_type) %>% 
  mutate(drug_type = recode(drug_type, "A" = "G", "N" = "B")) %>% 
  separate(excldate, into = c("month_excl_expires", "date_excl_expires", "year_excl_expires"))
## Parsed with column specification:
## cols(
##   appl_type = col_character(),
##   appl_no = col_integer(),
##   product_no = col_integer(),
##   exclcode = col_character(),
##   excldatestr = col_character(),
##   excldate = col_character()
## )
exclusivity
## # A tibble: 1,806 x 8
##    drug_type appl_no product_no exclcode  excldatestr month_excl_expires
##  *     <chr>   <int>      <int>    <chr>        <chr>              <chr>
##  1         G   65488          1       PC Mar 26, 2018                  3
##  2         G   65488          2       PC Mar 26, 2018                  3
##  3         G   78276          1       PC Apr 24, 2017                  4
##  4         G   78276          2       PC Apr 24, 2017                  4
##  5         G   78276          3       PC Apr 24, 2017                  4
##  6         G   78340          1       PC Jul 30, 2016                  7
##  7         G   78340          2       PC Jul 30, 2016                  7
##  8         G   78560          1       PC Jun 10, 2017                  6
##  9         G   78827          1       PC Apr 24, 2017                  4
## 10         G   78827          2       PC Apr 24, 2017                  4
## # ... with 1,796 more rows, and 2 more variables: date_excl_expires <chr>,
## #   year_excl_expires <chr>

The tidied exclusivity dataset has 1806 observations and 8 variables:

Dataset 6: Ingredient

We load the ingredient dataset.

ingredient = read_csv("./data/ingredient.csv") %>%
  clean_names() %>%
  select(singnum, singredient, everything())
## Parsed with column specification:
## cols(
##   ingredient = col_character(),
##   singnum = col_integer(),
##   singredient = col_character()
## )
ingredient
## # A tibble: 3,371 x 3
##    singnum         singredient
##      <int>               <chr>
##  1       1    ABACAVIR SULFATE
##  2       1    ABACAVIR SULFATE
##  3       2 DOLUTEGRAVIR SODIUM
##  4       3          LAMIVUDINE
##  5       1    ABACAVIR SULFATE
##  6       2          LAMIVUDINE
##  7       1    ABACAVIR SULFATE
##  8       2          LAMIVUDINE
##  9       3          ZIDOVUDINE
## 10       1       ABALOPARATIDE
## # ... with 3,361 more rows, and 1 more variables: ingredient <chr>

The tidied ingredient dataset has 3371 observations and 3 variables:

Dataset 7: Drug by Condition

Scraping data on drugs per condition from this website

urls = paste("https://www.centerwatch.com/drug-information/fda-approved-drugs/medical-conditions/", toupper(letters), sep = "")

get_drug_disease = function(url) {
  
  disease_data = read_html(url) %>% 
  html_nodes(css = ".ToggleDrugCategory") %>%
  html_text() %>% 
  as_tibble() %>% 
  mutate(key = value) %>% 
  rename(disease = value)
  
  drug_data = read_html(url) %>% 
  html_nodes(css = ".CategoryListSection a:nth-child(1)") %>%
  html_text() %>% 
  as_tibble() %>% 
  mutate(key = value) %>% 
  rename(drug = value)
  
  drug_disease_data = read_html(url) %>% 
  html_nodes(css = "#MainColumn a:nth-child(1)") %>%
  html_text() %>% 
  as_tibble() %>% 
  mutate(key = value) %>% 
  rename(drug_disease = value)
  
  drug_disease_joined = left_join(drug_disease_data, drug_data, disease_data, by = "key") %>% 
  left_join(., disease_data, by = "key") %>%
  fill(disease) %>%
  select(-c(drug,drug_disease))
  
  toremove = drug_disease_joined %>% distinct(disease) %>% pull()
  
  drug_disease_joined = drug_disease_joined %>% 
  filter(!key %in% toremove) %>% 
  rename(drug = key) %>% 
  .[-c(1:29),]
  
  drug_disease_joined
}

drug_disease_data = map_df(urls, get_drug_disease) %>% 
  mutate(disease = str_to_title(disease),
         drug = str_to_title(drug)) %>% 
  rename(trade_name = drug) %>%
  mutate(trade_name = trimws(trade_name, which = c("both"))) %>% 
  separate(trade_name, into = c("trade_name", "remove"), sep = " ")
## Warning: Too many values at 959 locations: 2, 3, 4, 5, 6, 8, 9, 10, 11, 12,
## 13, 14, 16, 18, 19, 21, 23, 24, 25, 26, ...
## Warning: Too few values at 139 locations: 7, 64, 68, 76, 109, 122, 132,
## 144, 145, 161, 162, 173, 174, 188, 197, 207, 221, 223, 228, 231, ...
drug_disease_data
## # A tibble: 1,881 x 3
##     trade_name         remove disease
##  *       <chr>          <chr>   <chr>
##  1       Avita            Gel    Acne
##  2  Benzamycin  (Erythromycin    Acne
##  3 Clindamycin      Phosphate    Acne
##  4 Clindamycin      Phosphate    Acne
##  5    Differin     (Adapalene    Acne
##  6   Estrostep (Norethindrone    Acne
##  7     Finevin           <NA>    Acne
##  8      Klaron        (Sodium    Acne
##  9       Ortho     Tri-Cyclen    Acne
## 10     Retin-A          Micro    Acne
## # ... with 1,871 more rows

The drug_disease_data contains 1881 observations and 3 variables:

Combine Datasets into one

#A: join datasets by ingredient for ingredient and products (left join)
#products = longer dataset + ingredient
product_ingredient = left_join(products, ingredient, by = "ingredient")
product_ingredient
## # A tibble: 40,443 x 20
##                                           ingredient doseform route
##                                                <chr>    <chr> <chr>
##  1                                  ABACAVIR SULFATE SOLUTION  ORAL
##  2                                  ABACAVIR SULFATE SOLUTION  ORAL
##  3                                  ABACAVIR SULFATE   TABLET  ORAL
##  4                                  ABACAVIR SULFATE   TABLET  ORAL
##  5                                  ABACAVIR SULFATE   TABLET  ORAL
##  6                                  ABACAVIR SULFATE   TABLET  ORAL
##  7                                  ABACAVIR SULFATE   TABLET  ORAL
##  8                                  ABACAVIR SULFATE   TABLET  ORAL
##  9 ABACAVIR SULFATE; DOLUTEGRAVIR SODIUM; LAMIVUDINE   TABLET  ORAL
## 10 ABACAVIR SULFATE; DOLUTEGRAVIR SODIUM; LAMIVUDINE   TABLET  ORAL
## # ... with 40,433 more rows, and 17 more variables: trade_name <chr>,
## #   applicant <chr>, strength <chr>, drug_type <chr>, appl_no <int>,
## #   product_no <int>, tecodestr <chr>, tecode <int>, approval_month <chr>,
## #   approval_day <chr>, approval_year <chr>, otc <fctr>, appfull <chr>,
## #   ref_listed_drug <fctr>, ref_std <fctr>, singnum <int>,
## #   singredient <chr>
#B: exclusivity and patents left join (more~less)
patent_exclusivity = left_join(patents, exclusivity, by = c("appl_no", "product_no", "drug_type"))
patent_exclusivity
## # A tibble: 22,379 x 18
##    drug_type appl_no product_no patent_num pediatric_exclusivity_granted
##        <chr>   <int>      <int>      <chr>                        <fctr>
##  1         B   22425          1    5223510                            No
##  2         B   22425          1    5223510                            No
##  3         B   22307          1    5288726                            No
##  4         B   22307          1    5288726                            No
##  5         B   22307          1    5288726                           Yes
##  6         B   22307          1    5288726                           Yes
##  7         B   21462          1    5344932                            No
##  8         B   21462          1    5344932                           Yes
##  9         B   22088          1    5362718                            No
## 10         B   22088          1    5362718                           Yes
## # ... with 22,369 more rows, and 13 more variables:
## #   month_pat_expires <chr>, day_pat_expires <chr>,
## #   year_pat_expires <chr>, pat_use_code <chr>, delist <chr>,
## #   drug_substance_flag <fctr>, drug_prod_flag <fctr>,
## #   patent_delisted <fctr>, exclcode <chr>, excldatestr <chr>,
## #   month_excl_expires <chr>, date_excl_expires <chr>,
## #   year_excl_expires <chr>
#C: join A and B left join: "Orange Book." Thursday: meet to eliminate variables (inner join: only commonalities are preserved) #eliminate drugs that do not match
orange_book = left_join(product_ingredient, patent_exclusivity, by = c("appl_no", "product_no", "drug_type"))

orange_book = orange_book %>%
  mutate(ingredient = str_to_title(ingredient),
         doseform = str_to_title(doseform),
         route = str_to_title(route),
         trade_name = str_to_title(trade_name),
         applicant = str_to_title(applicant),
         otc = str_to_title(otc),
         appfull = str_to_title(appfull),
         singredient = str_to_title(singredient),
         strength = str_to_title(strength))
orange_book
## # A tibble: 64,423 x 35
##          ingredient doseform route       trade_name            applicant
##               <chr>    <chr> <chr>            <chr>                <chr>
##  1 Abacavir Sulfate Solution  Oral Abacavir Sulfate  Hetero Labs Ltd Iii
##  2 Abacavir Sulfate Solution  Oral           Ziagen        Viiv Hlthcare
##  3 Abacavir Sulfate Solution  Oral           Ziagen        Viiv Hlthcare
##  4 Abacavir Sulfate Solution  Oral           Ziagen        Viiv Hlthcare
##  5 Abacavir Sulfate Solution  Oral           Ziagen        Viiv Hlthcare
##  6 Abacavir Sulfate   Tablet  Oral Abacavir Sulfate           Apotex Inc
##  7 Abacavir Sulfate   Tablet  Oral Abacavir Sulfate Aurobindo Pharma Ltd
##  8 Abacavir Sulfate   Tablet  Oral Abacavir Sulfate  Hetero Labs Ltd Iii
##  9 Abacavir Sulfate   Tablet  Oral Abacavir Sulfate     Mylan Pharms Inc
## 10 Abacavir Sulfate   Tablet  Oral Abacavir Sulfate       Strides Pharma
## # ... with 64,413 more rows, and 30 more variables: strength <chr>,
## #   drug_type <chr>, appl_no <int>, product_no <int>, tecodestr <chr>,
## #   tecode <int>, approval_month <chr>, approval_day <chr>,
## #   approval_year <chr>, otc <chr>, appfull <chr>, ref_listed_drug <fctr>,
## #   ref_std <fctr>, singnum <int>, singredient <chr>, patent_num <chr>,
## #   pediatric_exclusivity_granted <fctr>, month_pat_expires <chr>,
## #   day_pat_expires <chr>, year_pat_expires <chr>, pat_use_code <chr>,
## #   delist <chr>, drug_substance_flag <fctr>, drug_prod_flag <fctr>,
## #   patent_delisted <fctr>, exclcode <chr>, excldatestr <chr>,
## #   month_excl_expires <chr>, date_excl_expires <chr>,
## #   year_excl_expires <chr>
# final goal: disease, funding, drug prices, and Orange Book

# eliminate unnecessary variables from orange book dataset
orange_book_clean = orange_book %>%
  rename(application_no = appl_no, 
         active_ingredient = singredient, 
         dose_form = doseform) %>%
  select(-c(ingredient, applicant, product_no, tecodestr, tecode, approval_day, ref_std, singnum, day_pat_expires, pat_use_code, drug_substance_flag, drug_prod_flag, exclcode, excldatestr, date_excl_expires)) %>% 
  select(trade_name, everything()) %>% 
  separate(dose_form, into = c("dose_form", "remove")) %>%
  filter(is.na(remove)) %>% 
  select(-remove) %>% 
  filter(str_detect(dose_form, regex("Tablet", ignore_case = TRUE))) %>%
  mutate(trade_name = trimws(trade_name, which = c("both")),
         trade_name = str_replace_all(trade_name, ", ", "-"),
         trade_name = str_replace_all(trade_name, " And ", "-"))
## Warning: Too many values at 8293 locations: 78, 79, 80, 81, 82, 153, 154,
## 155, 156, 157, 158, 159, 634, 635, 636, 1085, 1086, 1087, 1088, 1089, ...
## Warning: Too few values at 51953 locations: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
## 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...
# inner join of disease drug and orange book =y by trade_name
final_dataset =
  new_drug_price %>%
  inner_join(., orange_book_clean, by = c("trade_name", "drug_type", "otc")) %>% 
  distinct() %>% 
  left_join(., drug_disease_data, by = "trade_name") %>% 
  distinct() %>% 
  separate(disease, into = c("disease", "remove"), sep = " ") %>% 
  left_join(., final_NIH_funding, by = c("disease", "year")) %>% 
  as_tibble() %>% 
  select(-c(remove, remove.x)) %>%
  distinct()
## Warning: Column `otc` joining factor and character vector, coercing into
## character vector
## Warning: Too many values at 11500 locations: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
## 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...
## Warning: Too few values at 4247 locations: 1366, 1368, 1370, 1372, 1374,
## 1376, 1459, 1460, 1462, 1463, 1465, 1466, 1468, 1469, 1471, 1472, 1474,
## 1475, 1477, 1478, ...

Josie

# Remomve some unecessary datasets
rm(patent_exclusivity, product_ingredient)

Manali

#do rare diseases recieve less funding?
#set.seed(1)
#final_dataset %>%
#  sample_n(50)%>%
#plot_ly(x~funding, y~prevalence, type = "scatter")

#does funding correlate with drug prices?

#final_dataset %>%
#  sample_n(50)%>%
#  plot_ly(x ~ price, y ~ funding, type = "scatter")

Muhire

We created a spaghetti plot of the average price of drugs per year for each drug. We only used data on prices per drug unit (mainly per tablet). Information about the disease that the drugs treat can be seen by hovering over the plot. We notice that of the drugs with information on price per unit drug, the drugs for Hepatitis seem to have the highest cost. Unfortunately we cannot tell which kind of hepatitis these drugs treat.

Possible case studies on most popular (perceived) drugs: Lipitor, Xanax, Celebrex, Prozac, Crestor, Nexium, Lisinopril, Zoloft

final_dataset %>% 
  filter(pricing_unit == "EA" & !is.na(disease)) %>% 
  plot_ly(x = ~year, y = ~avg_price, type = "scatter", mode = "lines", text = ~paste("Drug: ", trade_name, '<br>Disease:', disease), color = ~trade_name) %>% 
  layout(showlegend = FALSE)
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors

We also created a scatter plot of the mean NIH funding per drug (assumed to be the same as the NIH funding for disease research) against the demand of the drug (defined as the 2015 disease prevalence) colored by disease. Among drugs with available information on price per unit drug, obesity seems to have received the most funding on average and to have the highest 2015 prevalence.

final_dataset %>% 
  filter(pricing_unit == "EA" & !is.na(funding) & !is.na(prevalence)) %>% 
  group_by(trade_name, disease, prevalence) %>% 
  summarize(mean_fund = mean(funding)) %>% 
  plot_ly(x = ~prevalence, y = ~mean_fund, type = "scatter", alpha = 1, text = ~paste("Drug: ", trade_name, '<br>Disease:', disease), color = ~disease) %>%
  layout(showlegend = FALSE)
## No scatter mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors

MeOak